In this notebook, we are using the tmb_genomic.tsv file
generated from the 01-preprocess-data.Rmd script.
suppressPackageStartupMessages({
library(tidyverse)
})
# Locate the project root by looking for the ".git" folder, so that every
# path below resolves correctly no matter where this notebook is called from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
scratch_dir <- file.path(root_dir, "scratch")
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")

# Input files
tmb_genomic_file <- file.path(scratch_dir, "tmb_genomic.tsv")
palette_file <- file.path(
  root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv"
)

# File path to plots directory. `recursive = TRUE` also creates any missing
# parent directory instead of failing.
plots_dir <- file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir, recursive = TRUE)
}

# Source the plotting helper scripts and the figures theme script.
# Paths are assembled with file.path() rather than pasted "/" separators.
source(file.path(analysis_dir, "util", "function-create-barplot.R"))
source(file.path(analysis_dir, "util", "function-create-dumbbell-plot.R"))
source(file.path(root_dir, "figures", "scripts", "theme.R"))
# Read and process tmb_genomic file
df_total <- readr::read_tsv(
  tmb_genomic_file,
  guess_max = 100000,
  show_col_types = FALSE
)

# Are there any samples with both WGS and WXS?
# This summary is printed, not stored: one row per participant with all of
# that participant's experimental strategies joined by ";".
df_total %>%
  unique() %>%
  arrange(Kids_First_Participant_ID, experimental_strategy) %>%
  group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    experimental_strategy_sum = str_c(experimental_strategy, collapse = ";")
  )

# There are, so let's remove these from downstream analyses.
# `patient_id` combines the short histology and the participant ID.
df <- df_total %>%
  filter(experimental_strategy != "WXS") %>%
  dplyr::mutate(
    patient_id = paste(short_histology, Kids_First_Participant_ID, sep = "_")
  )

# Read color palette
palette_df <- readr::read_tsv(
  palette_file,
  guess_max = 100000,
  show_col_types = FALSE
)
We will explore TMB per Kids_First_Participant_ID over
time by creating stacked barplots.
# Define parameters for function
# Upper y-axis limit: the largest TMB value in `df`. Missing values are
# ignored so that a single NA cannot turn the limit itself into NA.
ylim <- max(df$tmb, na.rm = TRUE)

# Level df by timepoints
# NOTE: `f` is reused later in the notebook when the filtered data is
# releveled, so it must stay defined under this name.
f <- c(
  "Second Malignancy", "Unavailable", "Deceased",
  "Recurrence", "Progressive", "Diagnosis"
)

# Convert tumor_descriptor to a factor and move the levels in `f` to the front.
df_plot <- df %>%
  dplyr::mutate(
    tumor_descriptor = fct_relevel(factor(tumor_descriptor), f)
  )

# Run function
fname <- file.path(plots_dir, "TMB-genomic-total.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-total.pdf"
# Call create_stacked_barplot() on df_plot with the `ylim` value defined earlier.
# NOTE(review): assumed to return a printable plot object — confirm in the
# sourced helper script.
p <- create_stacked_barplot(tmb_df = df_plot, ylim = ylim)
# Open a PDF graphics device at `fname`, print the plot into it, then close
# the device so the file is finalized.
pdf(file = fname, width = 15, height = 6)
print(p)
dev.off()
quartz_off_screen
4
Attention: Hypermutant TMB is defined as ≥10 mutations/Mb, and ultra-hypermutant TMB as ≥100 mutations/Mb (https://pubmed.ncbi.nlm.nih.gov/29056344/).
Here, we notice that there are samples with high TMB (hypermutant samples). Next, we will exclude these samples (threshold >= 10) from downstream analysis. Care is needed in cases where only one timepoint has a high number of mutations, as removing it leaves unmatched longitudinal samples. We will also remove those cases so that we always have matched longitudinal samples.
# Filter df
# Steps:
#   1. Drop hypermutant samples (TMB >= 10) and duplicated rows.
#   2. Collapse each participant's remaining tumor descriptors into one
#      ";"-separated string.
#   3. Drop participants whose remaining samples collapse to a single
#      "Diagnosis", "Progressive" or "Recurrence" entry.
#   4. Join the sample-level rows of `df` back on.
#   5. Label cancer groups, order the timepoint levels, drop rows without TMB.
df_plot_filter <- df %>%
  filter(tmb < 10) %>%
  unique() %>%
  arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
  group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    tumor_descriptor_sum = str_c(tumor_descriptor, collapse = ";")
  ) %>%
  filter(
    !tumor_descriptor_sum %in% c("Diagnosis", "Progressive", "Recurrence")
  ) %>%
  # NOTE(review): this join assumes `df` already carries a
  # `tumor_descriptor_sum` column (presumably created during preprocessing)
  # — TODO confirm. Participants whose collapsed string no longer matches get
  # NA columns here and are removed by drop_na(tmb) below.
  dplyr::left_join(
    df,
    by = c("Kids_First_Participant_ID", "tumor_descriptor_sum")
  ) %>%
  mutate(
    # case_when() always returns a character vector, so this also works when
    # short_histology is empty or entirely NA. A missing histology is
    # labelled "Other"; any non-glioma histology is "Other cancer group".
    cancer_group_sum = case_when(
      is.na(short_histology) ~ "Other",
      short_histology == "HGAT" ~ "High-grade glioma",
      short_histology == "LGAT" ~ "Low-grade glioma",
      TRUE ~ "Other cancer group"
    ),
    tumor_descriptor = fct_relevel(factor(tumor_descriptor), f)
  ) %>%
  drop_na(tmb)

# Define parameters for function
# Rows with missing TMB were dropped above, so max() cannot return NA here.
ylim <- max(df_plot_filter$tmb)

# Run function
fname <- file.path(plots_dir, "TMB-genomic-no-hypermutants.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-no-hypermutants.pdf"
# Call create_stacked_barplot() on the filtered data with the `ylim` value
# defined earlier.
# NOTE(review): assumed to return a printable plot object — confirm in the
# sourced helper script.
p <- create_stacked_barplot(tmb_df = df_plot_filter, ylim = ylim)
# Open a PDF graphics device at `fname`, print the plot into it, then close
# the device so the file is finalized.
pdf(file = fname, width = 25, height = 8)
print(p)
dev.off()
quartz_off_screen
4
We will explore TMB per cancer group over time by creating dumbbell plots. We classified by using cancer types with the highest number of samples (High- and Low-grade gliomas) versus any other cancer groups.
# Collect the distinct cancer group labels, sorted in ascending order.
cancer_groups <- df_plot_filter$cancer_group_sum %>%
  as.character() %>%
  unique() %>%
  sort(decreasing = FALSE)
print(cancer_groups)
[1] "High-grade glioma" "Low-grade glioma" "Other cancer group"
# Upper y-axis limit for the TMB dumbbell plots, keyed by cancer group name.
# Keying by name (rather than by loop position) keeps each limit attached to
# its group even if the set or order of groups changes. Groups not listed
# here fall back to the default.
tmb_ylim_by_group <- c("High-grade glioma" = 8)
tmb_ylim_default <- 4

# Create one TMB dumbbell plot PDF per cancer group.
for (i in seq_along(cancer_groups)) {
  print(i)

  # Samples belonging to the current cancer group
  df_ct_sub <- df_plot_filter %>%
    filter(cancer_group_sum == cancer_groups[i])

  print(cancer_groups[i])

  # Define parameters for function
  ylim <- if (cancer_groups[i] %in% names(tmb_ylim_by_group)) {
    tmb_ylim_by_group[[cancer_groups[i]]]
  } else {
    tmb_ylim_default
  }

  # Name plots
  fname <- file.path(
    plots_dir,
    paste0("TMB-genomic-dumbbell", "-", cancer_groups[i], ".pdf")
  )
  print(fname)

  # Run function
  p <- create_dumbbell_ct(
    tmb_df = df_ct_sub,
    ylim = ylim,
    ct_id = cancer_groups[i]
  )

  # Open the PDF device, draw the plot, and close the device.
  pdf(file = fname, width = 18, height = 10)
  print(p)
  dev.off()
}
[1] 1
[1] "High-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-dumbbell-High-grade glioma.pdf"
[1] 2
[1] "Low-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-dumbbell-Low-grade glioma.pdf"
[1] 3
[1] "Other cancer group"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-dumbbell-Other cancer group.pdf"
# Upper y-axis limit for the mutation-count dumbbell plots, keyed by cancer
# group name. Keying by name (rather than by loop position) keeps each limit
# attached to its group even if the set or order of groups changes. Groups
# not listed here fall back to the default.
mut_ylim_by_group <- c("High-grade glioma" = 260)
mut_ylim_default <- 150

# Create one mutation-count dumbbell plot PDF per cancer group.
for (i in seq_along(cancer_groups)) {
  print(i)

  # Samples belonging to the current cancer group
  df_ct_sub <- df_plot_filter %>%
    filter(cancer_group_sum == cancer_groups[i])

  print(cancer_groups[i])

  # Define parameters for function
  ylim <- if (cancer_groups[i] %in% names(mut_ylim_by_group)) {
    mut_ylim_by_group[[cancer_groups[i]]]
  } else {
    mut_ylim_default
  }

  # Name plots
  fname <- file.path(
    plots_dir,
    paste0("Mutations-genomic-dumbbell", "-", cancer_groups[i], ".pdf")
  )
  print(fname)

  # Run function
  p <- create_dumbbell_ct_mut(
    tmb_df = df_ct_sub,
    ylim = ylim,
    ct_id = cancer_groups[i]
  )

  # Open the PDF device, draw the plot, and close the device.
  pdf(file = fname, width = 18, height = 10)
  print(p)
  dev.off()
}
[1] 1
[1] "High-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Mutations-genomic-dumbbell-High-grade glioma.pdf"
[1] 2
[1] "Low-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Mutations-genomic-dumbbell-Low-grade glioma.pdf"
[1] 3
[1] "Other cancer group"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Mutations-genomic-dumbbell-Other cancer group.pdf"
Here, we want to explore the number of mutations per timepoint and biospecimen sample per patient case.
# Distinct participant IDs present in the filtered data
samples <- unique(as.character(df_plot_filter$Kids_First_Participant_ID))
print(samples)

# Define parameters for function
# The y-axis limit is shared by every participant's plot and does not depend
# on the loop variable, so it is computed once before the loop.
ylim <- max(df_plot_filter$tmb)

# Create one barplot PDF per participant.
for (i in seq_along(samples)) {
  print(i)

  # Rows belonging to the current participant
  tmb_sub <- df_plot_filter %>%
    filter(Kids_First_Participant_ID == samples[i])

  # Run function
  fname <- file.path(plots_dir, paste0(samples[i], "-TMB-barplot.pdf"))
  print(fname)
  p <- create_barplot_sample(
    tmb_df = tmb_sub,
    ylim = ylim,
    sid = samples[i]
  )

  # Open the PDF device, draw the plot, and close the device.
  pdf(file = fname, width = 5, height = 4)
  print(p)
  dev.off()
}
Let’s try to do the same by using the facet_wrap function.
Again, we are missing values in timepoints. Maybe we can divide by
names(samples), adjust ylim for each group, and create 3-4 panels for each
ylim.
# Define parameters for function
# Upper y-axis limit: the largest mutation count in the filtered data.
# Missing values are ignored so the limit itself cannot become NA.
ylim <- max(df_plot_filter$mutation_count, na.rm = TRUE)

# Run function
fname <- file.path(plots_dir, "TMB-barplot.pdf")
print(fname)
p <- create_barplot_sample_panel(tmb_df = df_plot_filter, ylim = ylim)

# Open the PDF device, draw the plot, and close the device.
pdf(file = fname, width = 25, height = 20)
print(p)
dev.off()
# Print the R version, platform, and attached/loaded packages for this run.
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] ggthemes_4.2.4 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
[5] dplyr_1.1.1 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
[9] tibble_3.2.1 ggplot2_3.4.0 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] highr_0.10 bslib_0.4.2 compiler_4.2.3 pillar_1.9.0
[5] jquerylib_0.1.4 tools_4.2.3 bit_4.0.5 digest_0.6.31
[9] timechange_0.2.0 jsonlite_1.8.4 evaluate_0.20 lifecycle_1.0.3
[13] gtable_0.3.3 pkgconfig_2.0.3 rlang_1.1.0 cli_3.6.1
[17] parallel_4.2.3 yaml_2.3.7 xfun_0.38 fastmap_1.1.1
[21] withr_2.5.0 knitr_1.42 generics_0.1.3 vctrs_0.6.2
[25] sass_0.4.5 hms_1.1.3 bit64_4.0.5 rprojroot_2.0.3
[29] tidyselect_1.2.0 glue_1.6.2 R6_2.5.1 fansi_1.0.4
[33] vroom_1.6.1 rmarkdown_2.21 farver_2.1.1 tzdb_0.3.0
[37] magrittr_2.0.3 scales_1.2.1 htmltools_0.5.5 colorspace_2.1-0
[41] labeling_0.4.2 utf8_1.2.3 stringi_1.7.12 munsell_0.5.0
[45] cachem_1.0.7 crayon_1.5.2